/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

/*
 * bootcache reads the block trace taken during boot and
 * makes a boot cache from it.
 *
 * bootcache should be run after the system has booted
 * including bringing up chrome and login. Works in
 * conjunction with dm-bootcache device mapper to coalesce
 * the blocks used during boot.
 *
 * Sizes and offsets are measured in 512 byte sectors.
 * Space is allocated in chunks. The size of chunks is
 * derived from alignment restrictions obtained from
 * the header.
 *
 * bootcache [-t] <device-name> <raw-partition>
 *
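 *   -t - for testing - looks in a different place for
 *        information files. NOTE: the option parser in
 *        main() does not currently accept -t; passing it
 *        prints the usage message and exits.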
 *
 *   <device-name> e.g. dm-0. Device name without /dev/
 *        prefix.
 *
 * Files:
 * 1. Device - <raw-partition> - Where the blocks to be
 *             cached are stored. Both the original and
 *             cached copy.
 * 2. Header - /sys/devices/virtual/block/dm-0/dm/header
 *             Header for the boot cache. It contains the
 *             information the bootcache utility will need
 *             to create the bootcache.
 * 3. Trace  - /sys/devices/virtual/block/dm-0/dm/blocktrace
 *             Trace of files read during boot
 * 4. Valid  - /sys/devices/virtual/block/dm-0/dm/valid
 *             Returns "1" if cache is valid
 * 5. Free   - /sys/devices/virtual/block/dm-0/dm/free
 *             Write "1" to this file to free all the
 *             boot cache data including traces
 */

#define _XOPEN_SOURCE 600  /* Enable pread/pwrite/posix_memalign */

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/user.h>
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <syslog.h>
#include <unistd.h>

#include "dm-bootcache.h"

/* Short aliases for the fixed-width unsigned types used in this file. */
typedef uint64_t u64;
typedef uint32_t u32;

/* Shift that converts between 512-byte sectors and bytes. */
#define SECTOR_SHIFT 9
/* Size, in chunks, of the bounce buffer used when copying blocks. */
#define MAX_CHUNKS 128
/* Size of each buffer that holds a control-file path. */
#define MAX_FILE_NAME 256
/* Size of the buffer in which pr_fatal() builds its message. */
#define MAX_MSG 1024

/* Boot cache header: filled by read_header(), written by write_header(). */
static struct bootcache_hdr Header;
/* Trace loaded by read_trace(): tr points to an array of num records. */
static struct {
	struct bootcache_trace *tr;
	int num;
} Trace;

/* Program name used as the syslog ident and in diagnostic messages. */
static const char Progname[] = "bootcache";

/* Paths of the per-device control files, built by gen_file_names(). */
static char Valid_file[MAX_FILE_NAME];
static char Free_file[MAX_FILE_NAME];
static char Header_file[MAX_FILE_NAME];
static char Blocktrace_file[MAX_FILE_NAME];

/*
 * On-disk layout. Trace_start and Cache_start are sector numbers set by
 * compute_sections(); Chunk_size (bytes) and Sectors_per_chunk are
 * derived from Header.alignment by read_header().
 */
static u64 Trace_start;
static u64 Cache_start;
static u64 Chunk_size;
static u64 Sectors_per_chunk;

/*
 * fatal: report an error together with the caller's file, function and
 * line through pr_fatal(), which does not return. Takes printf-style
 * arguments; a format ending in ':' gets the errno text appended.
 */
#define fatal(fmt, ...)	pr_fatal(__FILE__, __FUNCTION__, __LINE__, \
				fmt, ## __VA_ARGS__)

/*
 * pr_fatal:  Because bootcache is not critical to
 * the running of the system, we only print what
 * happened and exit.
 *
 * Builds "<progname> <file>:<line>:<func> <message>" in a MAX_MSG byte
 * buffer, logs it with syslog(LOG_ERR) and exits with status 2. If fmt
 * ends in ':' the text for the errno value that was current on entry
 * is appended. Output that does not fit is truncated.
 *
 *   file, func, line - call site, normally supplied by the fatal() macro
 *   fmt, ...         - printf-style message; fmt may be NULL
 */
__attribute__ ((__format__ (__printf__, 4, 5)))
static void pr_fatal(
	const char *file,
	const char *func,
	int line,
	const char *fmt, ...)
{
	/* Capture errno first: fflush() and the formatting calls may change it. */
	const int saved_errno = errno;
	char msg[MAX_MSG];
	va_list args;
	size_t used = 0;
	size_t room;
	int r;

	fflush(stdout);
	r = snprintf(msg, sizeof(msg), "%s %s:%d:%s ", Progname, file, line, func);
	if (r < 0) {
		/* Formatting failed; start from an empty message. */
		msg[0] = '\0';
	} else {
		/* snprintf reports the untruncated length; clamp to what fits. */
		used = ((size_t)r < sizeof(msg)) ? (size_t)r : sizeof(msg) - 1;
	}
	room = sizeof(msg) - used;
	if (room > 1 && fmt) {
		va_start(args, fmt);
		r = vsnprintf(&msg[used], room, fmt, args);
		va_end(args);
		if (r < 0) {
			/* Drop whatever a failed vsnprintf may have left behind. */
			msg[used] = '\0';
		} else {
			used += ((size_t)r < room) ? (size_t)r : room - 1;
		}
		room = sizeof(msg) - used;

		if (room > 1 && fmt[0] != '\0' && fmt[strlen(fmt)-1] == ':') {
			snprintf(&msg[used], room, " %s (errno=%d)",
				strerror(saved_errno), saved_errno);
		}
	}
	syslog(LOG_ERR, "%s\n", msg);
	exit(2); /* conventional value for failed execution */
}

/*
 * emalloc: malloc() n bytes; a NULL result is reported through
 * fatal(), which exits the program.
 */
static void *emalloc(size_t n)
{
	void *mem = malloc(n);

	if (!mem) {
		fatal("malloc of %zu bytes failed:", n);
	}
	return mem;
}

/*
 * eopen: open() file with the given flags and return the descriptor.
 * A failed open is reported through fatal(), which exits the program.
 */
static int eopen(const char *file, int flags)
{
	const int fd = open(file, flags);

	if (fd != -1) {
		return fd;
	}
	fatal("open %s:", file);
	return fd;
}

/*
 * efsync: fsync() the descriptor and return the result of fsync().
 * A failure is reported through fatal(), which exits the program.
 */
static int efsync(int fd)
{
	const int status = fsync(fd);

	if (status == -1) {
		fatal("fsync:");
	}
	return status;
}

/*
 * eclose: close() the descriptor and return the result of close().
 * A failure is reported through fatal(), which exits the program.
 */
static int eclose(int fd)
{
	const int status = close(fd);

	if (status == -1) {
		fatal("close:");
	}
	return status;
}

/*
 * malloc_buf: allocate nchunks * Chunk_size bytes aligned to
 * Chunk_size. The caller releases the buffer with free().
 * An allocation failure is reported through fatal().
 */
static void *malloc_buf(size_t nchunks)
{
	void *chunks;
	const size_t bytes = nchunks * Chunk_size;
	const int err = posix_memalign(&chunks, Chunk_size, bytes);

	if (err != 0) {
		fatal("posix_memalign rc=%d", err);
	}
	return chunks;
}

static u64 num_sectors_in_cache(void)
{
	int i;
	u64 sum = 0;

	for (i = 0; i < Trace.num; i++) {
		sum += Trace.tr[i].count;
	}
	return sum;
}

/*
 * num_meta_sectors: return the number of sectors needed to hold the
 * trace records, rounded up to a whole number of chunks.
 */
static u64 num_meta_sectors(void)
{
	const u64 trace_bytes = Trace.num * sizeof(*Trace.tr);
	/* Round up to whole chunks, then convert chunks to sectors. */
	const u64 chunks = (trace_bytes + Chunk_size - 1) / Chunk_size;

	return chunks * Sectors_per_chunk;
}

static void compute_sections(void)
{
	Header.num_trace_recs = Trace.num;
	Header.sectors_meta = num_meta_sectors();
	Header.sectors_data = num_sectors_in_cache();
	Trace_start = Header.sector + Sectors_per_chunk;
	Cache_start = Trace_start + Header.sectors_meta;
}

static void copy_trace(int dst, int src, struct bootcache_trace tr, void *buf)
{
	u64 n;
	u64 remainder;
	u64 offset;
	int rc;

	offset    = tr.sector << SECTOR_SHIFT;
	remainder = tr.count << SECTOR_SHIFT;
	n = MAX_CHUNKS * Chunk_size;
	while (remainder) {
		if (n > remainder) {
			n = remainder;
		}
		rc = pread(src, buf, n, offset);
		if (rc < 0) {
			fatal("pread trace offset=%"PRIu64" num sectors=%"PRIu64":",
				offset >> SECTOR_SHIFT, n >> SECTOR_SHIFT);
		}
		if (rc != n) {
			fatal("pread read only %u bytes expected %"PRIu64,
				rc, n);
		}
		rc = write(dst, buf, n);
		if (rc < 0) {
			fatal("write trace offset=%"PRIu64" num sectors=%"PRIu64":",
				offset >> SECTOR_SHIFT, n >> SECTOR_SHIFT);
		}
		if (rc != n) {
			fatal("write wrote only %u bytes expected %"PRIu64,
				rc, n);
		}
		offset += n;
		remainder -= n;
	}
}

/*
 * copy_blocks: copy every traced extent of device into the cache area
 * of the same device, starting at sector Cache_start, in trace order.
 *
 *   device - path of the raw partition; opened once for reading and
 *            once for writing
 *
 * Any open, seek, read, write, fsync or close failure is fatal.
 */
static void copy_blocks(const char *device)
{
	int i;
	off_t rc;
	int src;
	int dst;
	void *buf;

	/* eopen() reports a failed open here instead of at a later syscall. */
	src = eopen(device, O_RDONLY);
	dst = eopen(device, O_WRONLY);
	buf = malloc_buf(MAX_CHUNKS);

	rc = lseek(dst, Cache_start << SECTOR_SHIFT, SEEK_SET);
	if (rc == -1) {
		fatal("lseek for cache start:");
	}
	for (i = 0; i < Trace.num; i++) {
		copy_trace(dst, src, Trace.tr[i], buf);
	}
	free(buf);
	efsync(dst);
	eclose(dst);
	eclose(src);
}

/*
 * dump_trace: debugging aid that prints the sector, count and ino of
 * every trace record to stdout. The body is compiled but disabled;
 * change the condition to enable it.
 */
static void dump_trace(void)
{
	struct bootcache_trace *tr = Trace.tr;
	int i;

	if (0) {
		for (i = 0; i < Trace.num; i++, tr++) {
			printf("%"PRIu64" %"PRIu64" %"PRIu64"\n",
				(uint64_t)tr->sector,
				(uint64_t)tr->count,
				(uint64_t)tr->ino);
		}
	}
}

/*
 * Because we are reading a pseudo file in sysfs,
 * we scan it to see how big it is.
 *
 * num_bytes: return the number of bytes read() delivers from file
 * before it reports end of file. Open, read and close failures are
 * fatal.
 */
static u64 num_bytes(const char *file)
{
	/*
	 * Fixed-size scratch buffer; the data is discarded. Its size does
	 * not depend on Chunk_size, so a zero or very large header
	 * alignment cannot produce an invalid or huge stack array.
	 */
	char buf[4096];
	ssize_t rc;
	u64 sum = 0;

	int fd = eopen(file, O_RDONLY);
	for (;;) {
		rc = read(fd, buf, sizeof(buf));
		if (rc == -1) {
			fatal("read %s:", file);
		}
		if (rc == 0) {
			break;
		}
		sum += rc;
	}
	eclose(fd);
	return sum;
}

/*
 * read_trace: load the block trace from file into Trace.
 *
 * The file is a sysfs pseudo file, so it is first scanned with
 * num_bytes() to learn its size. Data appended after that scan is
 * ignored; only the trace up to that point in time is wanted.
 * Trace.num is the number of whole records in the scanned size.
 */
static void read_trace(const char *file)
{
	u64 left = num_bytes(file);
	char *dst;
	int fd;

	Trace.tr  = emalloc(left);
	Trace.num = left / sizeof(struct bootcache_trace);
	fd = eopen(file, O_RDONLY);
	dst = (char *)Trace.tr;
	/* sysfs returns at most a page per read(), so keep reading. */
	while (left != 0) {
		const ssize_t got = read(fd, dst, left);

		if (got == -1) {
			fatal("read %s:", file);
		}
		if (got == 0) {
			fatal("trying to read %"PRIu64" bytes", left);
		}
		left -= got;
		dst += got;
	}
	dump_trace();
	eclose(fd);
}

/*
 * read_header: load the boot cache header from file into Header, check
 * it, and derive Chunk_size (bytes) and Sectors_per_chunk from its
 * alignment field.
 *
 * Fatal on: open/read/close failure, a read shorter than the header,
 * a magic or version mismatch, or an alignment smaller than one sector
 * (which would make Sectors_per_chunk zero).
 */
static void read_header(const char *file)
{
	int fd;
	ssize_t rc;

	fd = eopen(file, O_RDONLY);
	rc = read(fd, &Header, sizeof(Header));
	if (rc == -1) {
		fatal("read %s:", file);
	}
	eclose(fd);
	/* A partial header would leave later fields at stale values. */
	if ((size_t)rc != sizeof(Header)) {
		fatal("read %s: got %zd bytes expected %zu",
			file, rc, sizeof(Header));
	}
	if (Header.magic != BOOTCACHE_MAGIC) {
		fatal("Bad magic %u != %u", Header.magic, BOOTCACHE_MAGIC);
	}
	if (Header.version != BOOTCACHE_VERSION) {
		fatal("Bad version %u != %u", Header.version, BOOTCACHE_VERSION);
	}
	Chunk_size = Header.alignment;
	Sectors_per_chunk = Chunk_size >> SECTOR_SHIFT;
	/* Chunk_size is used as a divisor and an allocation alignment later. */
	if (Sectors_per_chunk == 0) {
		fatal("Bad alignment %"PRIu64, Chunk_size);
	}
}

/*
 * The header is written last after everything else, cache data and traces,
 * have been written to the disk. The header is what tells the boot cache
 * on the next boot that the cache is valid and should be used.
 * For correctness, we don't have to flush the header but the default
 * flush time is 10 minutes and there is no reason to wait.
 *
 * write_header: write Header to file at byte offset
 * Header.sector << SECTOR_SHIFT, then fsync and close. A write that
 * does not transfer the whole header is fatal.
 */
static void write_header(const char *file)
{
	const int fd = eopen(file, O_WRONLY);
	const int rc = pwrite(fd, &Header, sizeof(Header),
				Header.sector << SECTOR_SHIFT);

	if (rc != sizeof(Header)) {
		fatal("pwrite %s rc=%d:", file, rc);
	}
	efsync(fd);
	eclose(fd);
}

/*
 * write_trace: write the loaded trace records to file at byte offset
 * Trace_start << SECTOR_SHIFT, then fsync and close. A write that
 * does not transfer all the records is fatal.
 */
static void write_trace(const char *file)
{
	const ssize_t size = Trace.num * sizeof(*Trace.tr);
	const int fd = eopen(file, O_WRONLY);
	const ssize_t rc = pwrite(fd, Trace.tr, size,
					Trace_start << SECTOR_SHIFT);

	if (rc != size) {
		fatal("pwrite %s rc=%zd size=%zd:", file, rc, size);
	}
	efsync(fd);
	eclose(fd);
}

/*
 * Writing '1' to the free file indicates to
 * the bootcache that it can free all of its
 * resources.
 *
 * free_bootcache: write the single byte '1' to file. Open, write
 * and close failures are fatal.
 */
void free_bootcache(const char *file)
{
	static const char one[] = "1";
	const int fd = eopen(file, O_WRONLY);
	const int rc = write(fd, one, 1);

	if (rc == -1) {
		fatal("write %s:", file);
	}
	eclose(fd);
}

/*
 * A '1' in the first byte of the valid file indicates the
 * cache is valid. Otherwise it should be '0'.
 *
 * is_valid: return true when the first byte read from file is '1'.
 * A read error or an empty file is fatal. The result of read() is
 * checked before the descriptor is closed so that the errno reported
 * for a failed read is the one read() set.
 */
static bool is_valid(const char *file)
{
	char buf[1];
	int fd;
	ssize_t rc;

	fd = eopen(file, O_RDONLY);
	rc = read(fd, buf, sizeof(buf));
	if (rc == -1) {
		fatal("read %s:", file);
	}
	if (rc == 0) {
		/* End of file is not an errno condition; say so explicitly. */
		fatal("read %s: file is empty", file);
	}
	eclose(fd);
	return buf[0] == '1';
}

/*
 * gen_file_name: format prefix and name into file_name using fmt.
 *
 *   file_name - destination buffer
 *   size      - size of the destination buffer in bytes
 *   fmt       - printf-style format consuming two string arguments
 *   prefix    - first string argument
 *   name      - second string argument
 *
 * A result that does not fit in size bytes is fatal.
 */
static void gen_file_name(char *file_name, int size, const char *fmt,
				const char *prefix, const char *name)
{
	const int needed = snprintf(file_name, size, fmt, prefix, name);

	if (needed < size) {
		return;
	}
	fatal("Name too long %s", name);
}

/*
 * gen_file_names: build the paths of the valid, free, header and
 * blocktrace control files for device_mapper, in that order, using
 * fmt as the path template.
 */
static void gen_file_names(const char *fmt, const char *device_mapper)
{
	static const struct {
		char *path;
		int size;
		const char *leaf;
	} targets[] = {
		{ Valid_file,      sizeof(Valid_file),      "valid" },
		{ Free_file,       sizeof(Free_file),       "free" },
		{ Header_file,     sizeof(Header_file),     "header" },
		{ Blocktrace_file, sizeof(Blocktrace_file), "blocktrace" },
	};
	size_t k;

	for (k = 0; k < sizeof(targets) / sizeof(targets[0]); k++) {
		gen_file_name(targets[k].path, targets[k].size,
				fmt, device_mapper, targets[k].leaf);
	}
}

/*
 * usage: print the command line synopsis and an example to stderr,
 * then exit with status 2.
 */
static void usage(void)
{
	fprintf(stderr,
		"Usage: %s [-t] <device mapper> <raw partition>\n"
		"  e.g %s dm-0 /dev/sda3\n",
		Progname,
		Progname);
	exit(2);
}

/*
 * main: bootcache <device mapper> <raw partition>
 *
 * Builds the control-file paths for the device mapper device. If the
 * valid file does not report '1', rebuilds the cache on the raw
 * partition: read header and trace, compute the layout, copy the
 * blocks, then write the trace and finally the header. In either case
 * the free file is written afterwards. Returns 0; every failure exits
 * through fatal() or usage() with status 2.
 *
 * NOTE(review): the usage text advertises -t, but the option string
 * passed to getopt() accepts no options, so -t ends in usage().
 */
int main(int argc, char *argv[])
{
	const char *device_mapper;
	const char *raw_partition;
	int opt;

	openlog(Progname, LOG_PERROR | LOG_CONS | LOG_PID, 0);
	syslog(LOG_ERR, "started\n");

	/* No options are accepted: whatever getopt reports ends in usage(). */
	while ((opt = getopt(argc, argv, "?")) != -1) {
		usage();
	}
	/* Exactly two positional arguments must remain. */
	if (argc - optind != 2) {
		usage();
	}
	device_mapper = argv[optind];
	raw_partition = argv[optind + 1];

	gen_file_names("/sys/devices/virtual/block/%s/dm/%s",
			device_mapper);
	if (!is_valid(Valid_file)) {
		/*
		 * Rebuild the bootcache
		 */
		read_header(Header_file);
		read_trace(Blocktrace_file);
		compute_sections();
		copy_blocks(raw_partition);
		write_trace(raw_partition);
		write_header(raw_partition);
	}
	free_bootcache(Free_file);
	syslog(LOG_ERR, "done\n");
	return 0;
}
